Prepare libraries

library(tidyverse)
library(data.table)
library(gdata)
library(CLIPanalyze)
library(BSgenome)
library(VennDiagram)
library(ggplot2)
library(biomaRt)
library(RColorBrewer)
library(plotly)

Load RNA-Seq data

dir.create("PDF_figure/CDF_zscore_merged", showWarnings = FALSE)

DGE <- read_csv("~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/RNA-Seq/Mouse colon tumor/4-OHT enema model/Analysis/Differential Analysis_filtered.csv")
## New names:
## * `` -> ...1
## Rows: 11143 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ...1
## dbl (6): baseMean, log2FoldChange, lfcSE, stat, pvalue, padj
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames(DGE)[1] <- "EnsemblID"

Load proteomics data

DGE_pro <- read_csv("~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/Proteomics data/colon tumor-enema model/crcMS_diff.csv")
## New names:
## * `` -> ...1
## Rows: 8185 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Protein Id, Description
## dbl (5): ...1, p_values, q_values, foldChange, LFC
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames(DGE_pro)[2] <- "UniprotID"

Load miR-target list

mir.targets <- readRDS("Datafiles/miRNA-merged-peaks-list-09282019-withIDs.rds")

Z-score calculation

RNA

Compute GSEA z-scores using the PAGE method

zscore.cal <- function(genes = NA, index = NA, expression.dataset){
  if (sum(!is.na(genes)) > 0 && is.na(index)){
    gene.set <- expression.dataset[expression.dataset$EnsemblID %in% genes,]
  } else if (!is.na(index)) {
    gene.set <- expression.dataset[index,]
  } else {
    print("Please input gene names or row index.")
  }
  mu <- mean(expression.dataset$log2FoldChange)
  Sm <- mean(gene.set$log2FoldChange)
  m <- length(genes)
  SD <- sd(expression.dataset$log2FoldChange)
  z <- (Sm-mu)*sqrt(m)/SD
  return(z)
}
zscores.all <- as.data.frame(sapply(mir.targets,
                              function(targets){
                                  zscore.cal(genes = targets$target_Ensembl_ID, expression.dataset = DGE)
                            }))
colnames(zscores.all) <- c("z.score")
lens <- as.data.frame(sapply(mir.targets, function(x) length(x)))
colnames(lens) <- "N"
zscores.all <- cbind(zscores.all, lens$N)

mirna.family.DGE <- readRDS("Datafiles/mirna-counts-deseq-by-family-09282019.rds")
zscores.all <- as_tibble(zscores.all, rownames = "miR.family")
zscores.all <- inner_join(zscores.all, mirna.family.DGE[,c(1,7)], by = c("miR.family" = "miR.family"))
colnames(zscores.all)[3] <- "N"
p_rna <- ggplot(zscores.all, aes(x = z.score, y = log2(baseMean), size = N, label = miR.family)) +
  geom_point(colour = "#EC469A", alpha = 0.6) +
  xlab("RNA-Seq Z-score") +
  ylab("miRNA family abundance") +
  theme_bw() +
      theme(panel.border = element_rect(),
      panel.background = element_blank(),
      panel.grid.major = element_line(), 
      panel.grid.minor = element_line(),
      axis.title.x = element_text(size=14, margin = margin(t = 10)),
      axis.title.y = element_text(size=14, margin = margin(r = 10)),
      axis.text = element_text(size=10),
      axis.line.y = element_blank(),
      axis.line.x = element_blank(),
      axis.ticks.x = element_line(size = 0.5),
      axis.ticks.y = element_line(size = 0.5),
      plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))

ggplotly(p_rna)
pdf("PDF_figure/CDF_zscore_merged/RNA_CDF_zscore.pdf",
    height = 4,
    width = 6)
ggplot(zscores.all, aes(x = z.score, y = log2(baseMean), size = N)) +
  geom_point(colour = "#EC469A", alpha = 0.6) +
  xlab("RNA-Seq Z-score") +
  ylab("miRNA family abundance") +
  theme_bw() +
      theme(panel.border = element_rect(),
      panel.background = element_blank(),
      panel.grid.major = element_line(), 
      panel.grid.minor = element_line(),
      axis.title.x = element_text(size=14, margin = margin(t = 10)),
      axis.title.y = element_text(size=14, margin = margin(r = 10)),
      axis.text = element_text(size=10),
      axis.line.y = element_blank(),
      axis.line.x = element_blank(),
      axis.ticks.x = element_line(size = 0.5),
      axis.ticks.y = element_line(size = 0.5),
      plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))
dev.off()
## quartz_off_screen 
##                 2
cor.test(zscores.all$z.score, log2(zscores.all$baseMean))
## 
##  Pearson's product-moment correlation
## 
## data:  zscores.all$z.score and log2(zscores.all$baseMean)
## t = 2.7268, df = 79, p-value = 0.007875
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.08008006 0.48085539
## sample estimates:
##      cor 
## 0.293301

Protein

Compute GSEA z-scores using the PAGE method

zscore.cal <- function(genes = NA, index = NA, expression.dataset){
  if (sum(!is.na(genes)) > 0 && is.na(index)){
    gene.set <- expression.dataset[expression.dataset$UniprotID %in% genes,]
  } else if (!is.na(index)) {
    gene.set <- expression.dataset[index,]
  } else {
    print("Please input gene names or row index.")
  }
  mu <- mean(expression.dataset$LFC)
  Sm <- mean(gene.set$LFC)
  m <- length(genes)
  SD <- sd(expression.dataset$LFC)
  z <- (Sm-mu)*sqrt(m)/SD
  return(z)
}
zscores.all <- as.data.frame(sapply(mir.targets,
                              function(targets){
                                  zscore.cal(genes = targets$target_Uniprot_ID, expression.dataset = DGE_pro)
                            }))
colnames(zscores.all) <- c("z.score")
lens <- as.data.frame(sapply(mir.targets, function(x) length(x)))
colnames(lens) <- "N"
zscores.all <- cbind(zscores.all, lens$N)
zscores.all <- as_tibble(zscores.all, rownames = "miR.family")
zscores.all <- inner_join(zscores.all, mirna.family.DGE[,c(1,7)], by = c("miR.family" = "miR.family"))
colnames(zscores.all)[3] <- "N"
p_protein <- ggplot(zscores.all, aes(x = z.score, y = log2(baseMean), size = N, label = miR.family)) +
  geom_point(colour = "#1C75BB", alpha = 0.6) +
  xlab("Proteomics Z-score") +
  ylab("miRNA family abundance") +
  theme_bw() +
      theme(panel.border = element_rect(),
      panel.background = element_blank(),
      panel.grid.major = element_line(), 
      panel.grid.minor = element_line(),
      axis.title.x = element_text(size=14, margin = margin(t = 10)),
      axis.title.y = element_text(size=14, margin = margin(r = 10)),
      axis.text = element_text(size=10),
      axis.line.y = element_blank(),
      axis.line.x = element_blank(),
      axis.ticks.x = element_line(size = 0.5),
      axis.ticks.y = element_line(size = 0.5),
      plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))

ggplotly(p_protein)
pdf("PDF_figure/CDF_zscore_merged/Protein_CDF_zscore.pdf",
    height = 4,
    width = 6)
ggplot(zscores.all, aes(x = z.score, y = log2(baseMean), size = N)) +
  geom_point(colour = "#1C75BB", alpha = 0.6) +
  xlab("Proteomics Z-score") +
  ylab("miRNA family abundance") +
  theme_bw() +
      theme(panel.border = element_rect(),
      panel.background = element_blank(),
      panel.grid.major = element_line(), 
      panel.grid.minor = element_line(),
      axis.title.x = element_text(size=14, margin = margin(t = 10)),
      axis.title.y = element_text(size=14, margin = margin(r = 10)),
      axis.text = element_text(size=10),
      axis.line.y = element_blank(),
      axis.line.x = element_blank(),
      axis.ticks.x = element_line(size = 0.5),
      axis.ticks.y = element_line(size = 0.5),
      plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))
dev.off()
## quartz_off_screen 
##                 2
cor.test(zscores.all$z.score, log2(zscores.all$baseMean))
## 
##  Pearson's product-moment correlation
## 
## data:  zscores.all$z.score and log2(zscores.all$baseMean)
## t = 3.5366, df = 79, p-value = 0.0006816
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1646528 0.5441340
## sample estimates:
##       cor 
## 0.3697104

SessionInfo

sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.4
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] plotly_4.10.0               RColorBrewer_1.1-2         
##  [3] biomaRt_2.48.3              VennDiagram_1.7.0          
##  [5] futile.logger_1.4.3         BSgenome_1.60.0            
##  [7] rtracklayer_1.52.1          CLIPanalyze_0.0.10         
##  [9] DESeq2_1.32.0               GenomicAlignments_1.28.0   
## [11] Rsamtools_2.8.0             Biostrings_2.60.2          
## [13] XVector_0.32.0              SummarizedExperiment_1.22.0
## [15] MatrixGenerics_1.4.3        matrixStats_0.61.0         
## [17] GenomicFeatures_1.44.2      AnnotationDbi_1.54.1       
## [19] Biobase_2.52.0              GenomicRanges_1.44.0       
## [21] GenomeInfoDb_1.28.4         IRanges_2.26.0             
## [23] S4Vectors_0.30.2            BiocGenerics_0.38.0        
## [25] plyr_1.8.6                  gdata_2.18.0               
## [27] data.table_1.14.2           forcats_0.5.1              
## [29] stringr_1.4.0               dplyr_1.0.7                
## [31] purrr_0.3.4                 readr_2.1.0                
## [33] tidyr_1.1.4                 tibble_3.1.6               
## [35] ggplot2_3.3.5               tidyverse_1.3.1            
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_2.0-2       rjson_0.2.20           ellipsis_0.3.2        
##  [4] fs_1.5.0               rstudioapi_0.13        farver_2.1.0          
##  [7] bit64_4.0.5            fansi_0.5.0            lubridate_1.8.0       
## [10] xml2_1.3.2             splines_4.1.1          cachem_1.0.6          
## [13] geneplotter_1.70.0     knitr_1.36             jsonlite_1.7.2        
## [16] broom_0.7.10           annotate_1.70.0        dbplyr_2.1.1          
## [19] png_0.1-7              compiler_4.1.1         httr_1.4.2            
## [22] biosignals_0.1.0       backports_1.4.0        lazyeval_0.2.2        
## [25] assertthat_0.2.1       Matrix_1.3-4           fastmap_1.1.0         
## [28] cli_3.1.0              formatR_1.11           htmltools_0.5.2       
## [31] prettyunits_1.1.1      tools_4.1.1            gtable_0.3.0          
## [34] glue_1.5.0             GenomeInfoDbData_1.2.6 rappdirs_0.3.3        
## [37] Rcpp_1.0.7             cellranger_1.1.0       jquerylib_0.1.4       
## [40] vctrs_0.3.8            crosstalk_1.2.0        xfun_0.28             
## [43] rvest_1.0.2            lifecycle_1.0.1        restfulr_0.0.13       
## [46] gtools_3.9.2           XML_3.99-0.8           zlibbioc_1.38.0       
## [49] scales_1.1.1           vroom_1.5.6            hms_1.1.1             
## [52] lambda.r_1.2.4         yaml_2.2.1             curl_4.3.2            
## [55] memoise_2.0.1          stringi_1.7.5          RSQLite_2.2.8         
## [58] genefilter_1.74.1      BiocIO_1.2.0           filelock_1.0.2        
## [61] BiocParallel_1.26.2    rlang_0.4.12           pkgconfig_2.0.3       
## [64] bitops_1.0-7           evaluate_0.14          lattice_0.20-45       
## [67] labeling_0.4.2         htmlwidgets_1.5.4      bit_4.0.4             
## [70] tidyselect_1.1.1       magrittr_2.0.1         R6_2.5.1              
## [73] generics_0.1.1         DelayedArray_0.18.0    DBI_1.1.1             
## [76] pillar_1.6.4           haven_2.4.3            withr_2.4.2           
## [79] survival_3.2-13        KEGGREST_1.32.0        RCurl_1.98-1.5        
## [82] modelr_0.1.8           crayon_1.4.2           futile.options_1.0.1  
## [85] utf8_1.2.2             BiocFileCache_2.0.0    tzdb_0.2.0            
## [88] rmarkdown_2.11         progress_1.2.2         locfit_1.5-9.4        
## [91] readxl_1.3.1           blob_1.2.2             reprex_2.0.1          
## [94] digest_0.6.28          xtable_1.8-4           munsell_0.5.0         
## [97] viridisLite_0.4.0